Which data bases dplyr can work with? Five commmon ones: RMySQL, RPostgreSQL, SQLite database, odbc and bigrquery.
What is bigrquery and how to work with it
#library(bigrquery)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(stringr)
https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew/data
df = read.csv("Taxi_Trips.csv")
df
Q: After you have data Make table with day of the week in the column name, Length of a trip in vertical (you will have to cut it), and average tip in cells
df$Tips <- str_replace_all(df$Tips, "\\$", "") %>%
as.numeric()
table_tips <- df %>%
select(Trip.Start.Timestamp, Trip.End.Timestamp, Tips) %>%
mutate(Start_date = mdy_hms(Trip.Start.Timestamp)) %>%
mutate(wkday = wday(Start_date, label = TRUE)) %>%
separate(Start_date, c("st_date","st_time"), sep = " ") %>%
separate(st_time, c("st_hr","st_min","st_sc"), sep = ":") %>%
mutate(End_date = mdy_hms(Trip.End.Timestamp)) %>%
separate(End_date, c("ed_date","ed_time"), sep = " ") %>%
separate(ed_time, c("ed_hr","ed_min","ed_sc"), sep = ":") %>%
mutate(hr = as.numeric(ed_hr) - as.numeric(st_hr)) %>%
mutate(min = as.numeric(ed_min) - as.numeric(st_min)) %>%
filter(hr>0) %>%
mutate(Duration = hr*60 + min) %>%
select(wkday,Duration,Tips)
table_tips
table_tips_wkday <- table_tips %>%
group_by(wkday,Duration) %>%
summarise(avg_tip = mean(Tips, na.rm = T)) %>%
spread(wkday,avg_tip)
table_tips_wkday
The ones with tip = 0 influences the result. Not sure if they are correct data.
table_tips_cuttime <- table_tips %>%
group_by(wkday,Duration_interval = cut_width(Duration, 120, boundary = 0)) %>%
summarise(avg_tip = mean(Tips)) %>%
spread(wkday,avg_tip)
table_tips_cuttime